Time series clustering partitions time series data into groups based on similarity or distance, so that series within the same cluster resemble one another.
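As a minimal illustrative sketch of that idea (not the method used below, which first learns a latent representation with a variational recurrent autoencoder), fixed-length windows can be clustered directly by flattening each window and running k-means; the array shape here is hypothetical and only mirrors the windowed data used later:
import numpy as np
from sklearn.cluster import KMeans
# Toy data: 100 windows of length 10 with a single channel (hypothetical shapes)
windows = np.random.randn(100, 10, 1)
# Flatten each window into one feature vector and cluster with k-means
flat = windows.reshape(len(windows), -1)
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(flat)
print(labels[:10])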
Methodology followed:
from vrae.vrae import VRAE
from vrae.utils import *
import numpy as np
import torch
import pickle
import plotly
from torch.utils.data import DataLoader, TensorDataset
plotly.offline.init_notebook_mode()
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt  # used for the loss curves and reconstruction plots below
%load_ext autoreload
%autoreload 2
dload = './model_dir'
seq_len = 10
hidden_size = 256
hidden_layer_depth = 3
latent_length = 32
batch_size = 32
learning_rate = 0.00002
n_epochs = 2000
dropout_rate = 0.0
optimizer = 'Adam'  # options: Adam, SGD
cuda = True  # options: True, False
print_every = 10
clip = True  # options: True, False
max_grad_norm = 5
loss = 'MSELoss'  # options: SmoothL1Loss, MSELoss
block = 'LSTM'  # options: LSTM, GRU
#X_train, X_val, y_train, y_val = open_data('data', ratio_train=1.0, dataset='EMG', filename = "EMGe5_len10")
do_pca = False
single_channel = 5  # 1-based index of the channel to keep; set to 0 to keep all channels
X_train, y_train = open_data_nosplit('data', dataset='EMG', filename = "EMGe57_len10")
# print(f'training dataset shape: {X_train.shape}')
# Data is sequential, so we need to reshape it to add a time dimension
X_train = X_train.reshape(X_train.shape[0], seq_len, -1)
# print(f'training dataset shape: {X_train.shape}')
# Drop the trailing segments so the number of sequences is divisible by the batch size
num_seg = (X_train.shape[0] // batch_size) * batch_size
X_train = X_train[:num_seg, :, :]
# print(f'training dataset shape: {X_train.shape}')
if do_pca:
    temp = X_train.reshape(-1, X_train.shape[2])
    X_pca = PCA(n_components=15).fit(temp)
    print(f'Explained ratio: {np.cumsum(X_pca.explained_variance_ratio_)}')
    X_train = X_pca.transform(temp)[:, :6]
    # print(f'training dataset shape: {X_train.shape}')
    X_train = X_train.reshape(-1, seq_len, 6)
if single_channel:
    X_train = X_train[:, :, single_channel-1]
    X_train = np.expand_dims(X_train, -1)
print(f'Training dataset shape: {X_train.shape}')
train_dataset = TensorDataset(torch.from_numpy(X_train))
# test_dataset = TensorDataset(torch.from_numpy(X_train))
number_of_features = X_train.shape[2]
# num_classes = len(np.unique(y_train))
# base = np.min(y_train) # Check if data is 0-based
# if base != 0:
# y_train -= base
y_train = y_train[:num_seg, :]
print(f'Training label: {np.unique(y_train)}, shape: {y_train.shape}')
Training dataset shape: (17984, 10, 1)
Training label: [-1. 0. 1. 2. 3. 4.], shape: (17984, 1)
X_test, y_test = open_data_nosplit('data', dataset='EMG', filename = "EMG7_len10")
# Data is sequential, so we need to reshape it to add a time dimension
X_test = X_test.reshape(X_test.shape[0], seq_len, -1)
# cut the last several segments to match batch size
# num_seg = (X_train.shape[0] // batch_size) * batch_size
# X_train = X_train[:num_seg, :, :]
print(f'Testing dataset shape: {X_test.shape}')
# train_dataset = TensorDataset(torch.from_numpy(X_train))
test_dataset = TensorDataset(torch.from_numpy(X_test))
# number_of_features = X_train.shape[2]
print(f'Testing label: {np.unique(y_test)}, shape: {y_test.shape}')
Testing dataset shape: (3599, 10, 15)
Testing label: [-1. 0. 1. 2. 3.], shape: (3599, 1)
VRAE inherits from sklearn.base.BaseEstimator and overrides the fit, transform, and fit_transform methods, so it can be used like other scikit-learn estimators.
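A minimal sketch of that estimator-style workflow, assuming the vrae object constructed in the next cell and the train_dataset built above (the actual calls appear in the cells that follow):
vrae.fit(train_dataset)                 # learn the encoder/decoder parameters
z_run = vrae.transform(train_dataset)   # one latent vector per sequence
# or, equivalently: z_run = vrae.fit_transform(train_dataset)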
vrae = VRAE(sequence_length=seq_len,
number_of_features = number_of_features,
hidden_size = hidden_size,
hidden_layer_depth = hidden_layer_depth,
latent_length = latent_length,
batch_size = batch_size,
learning_rate = learning_rate,
n_epochs = n_epochs,
dropout_rate = dropout_rate,
optimizer = optimizer,
cuda = cuda,
print_every=print_every,
clip=clip,
max_grad_norm=max_grad_norm,
loss = loss,
block = block,
dload = dload)
/home/roton2/miniconda3/envs/emg/lib/python3.9/site-packages/torch/nn/_reduction.py:42: UserWarning: size_average and reduce args will be deprecated, please use reduction='sum' instead.
# Train the model; to save the learnt parameters afterwards, use vrae.save() as shown below.
vrae.fit(train_dataset)
Epoch: 9 Average loss: 8949.5160 Epoch: 19 Average loss: 4895.5259 Epoch: 29 Average loss: 3825.1838 Epoch: 39 Average loss: 2788.0825 Epoch: 49 Average loss: 2073.6451 Epoch: 59 Average loss: 1620.6756 Epoch: 69 Average loss: 1344.2656 Epoch: 79 Average loss: 900.9952 Epoch: 89 Average loss: 713.5740 Epoch: 99 Average loss: 431.1978 Epoch: 109 Average loss: 299.0331 Epoch: 119 Average loss: 166.3378 Epoch: 129 Average loss: 79.8649 Epoch: 139 Average loss: 54.0549 Epoch: 149 Average loss: 42.2346 Epoch: 159 Average loss: 36.3163 Epoch: 169 Average loss: 32.3547 Epoch: 179 Average loss: 30.8365 Epoch: 189 Average loss: 26.9527 Epoch: 199 Average loss: 25.4955 Epoch: 209 Average loss: 23.7469 Epoch: 219 Average loss: 22.0721 Epoch: 229 Average loss: 22.7790 Epoch: 239 Average loss: 20.4923 Epoch: 249 Average loss: 19.6221 Epoch: 259 Average loss: 18.7699 Epoch: 269 Average loss: 18.4082 Epoch: 279 Average loss: 17.8353 Epoch: 289 Average loss: 17.2369 Epoch: 299 Average loss: 16.7112 Epoch: 309 Average loss: 16.2387 Epoch: 319 Average loss: 16.3976 Epoch: 329 Average loss: 15.3285 Epoch: 339 Average loss: 15.0671 Epoch: 349 Average loss: 14.9346 Epoch: 359 Average loss: 14.8459 Epoch: 369 Average loss: 14.0796 Epoch: 379 Average loss: 13.9929 Epoch: 389 Average loss: 13.8725 Epoch: 399 Average loss: 13.9647 Epoch: 409 Average loss: 13.3512 Epoch: 419 Average loss: 13.3711 Epoch: 429 Average loss: 12.9314 Epoch: 439 Average loss: 13.2751 Epoch: 449 Average loss: 12.6651 Epoch: 459 Average loss: 12.5736 Epoch: 469 Average loss: 12.1197 Epoch: 479 Average loss: 11.9629 Epoch: 489 Average loss: 11.9430 Epoch: 499 Average loss: 12.2598 Epoch: 509 Average loss: 12.0413 Epoch: 519 Average loss: 11.5267 Epoch: 529 Average loss: 11.3207 Epoch: 539 Average loss: 11.2342 Epoch: 549 Average loss: 11.4240 Epoch: 559 Average loss: 10.9849 Epoch: 569 Average loss: 10.7775 Epoch: 579 Average loss: 10.9148 Epoch: 589 Average loss: 10.8364 Epoch: 599 Average loss: 10.5660 Epoch: 609 Average loss: 10.8777 Epoch: 619 Average loss: 10.7248 Epoch: 629 Average loss: 10.7069 Epoch: 639 Average loss: 10.3693 Epoch: 649 Average loss: 10.0192 Epoch: 659 Average loss: 10.6791 Epoch: 669 Average loss: 10.0095 Epoch: 679 Average loss: 10.0594 Epoch: 689 Average loss: 9.9344 Epoch: 699 Average loss: 10.1093 Epoch: 709 Average loss: 9.7402 Epoch: 719 Average loss: 9.8231 Epoch: 729 Average loss: 9.6576 Epoch: 739 Average loss: 9.6094 Epoch: 749 Average loss: 9.4974 Epoch: 759 Average loss: 9.3096 Epoch: 769 Average loss: 9.4864 Epoch: 779 Average loss: 9.5467 Epoch: 789 Average loss: 9.3593 Epoch: 799 Average loss: 9.3833 Epoch: 809 Average loss: 9.1737 Epoch: 819 Average loss: 8.9347 Epoch: 829 Average loss: 9.1267 Epoch: 839 Average loss: 8.9813 Epoch: 849 Average loss: 9.2919 Epoch: 859 Average loss: 9.0777 Epoch: 869 Average loss: 8.7609 Epoch: 879 Average loss: 8.7331 Epoch: 889 Average loss: 8.9821 Epoch: 899 Average loss: 8.7296 Epoch: 909 Average loss: 8.4580 Epoch: 919 Average loss: 8.5467 Epoch: 929 Average loss: 8.4667 Epoch: 939 Average loss: 8.8248 Epoch: 949 Average loss: 8.7020 Epoch: 959 Average loss: 9.0007 Epoch: 969 Average loss: 8.6505 Epoch: 979 Average loss: 8.4632 Epoch: 989 Average loss: 8.2483 Epoch: 999 Average loss: 8.6857 Epoch: 1009 Average loss: 8.4263 Epoch: 1019 Average loss: 8.2642 Epoch: 1029 Average loss: 8.2463 Epoch: 1039 Average loss: 8.2516 Epoch: 1049 Average loss: 8.1853 Epoch: 1059 Average loss: 8.0090 Epoch: 1069 Average loss: 8.4222 Epoch: 1079 Average loss: 7.9815 Epoch: 1089 
Average loss: 7.9017 Epoch: 1099 Average loss: 8.1680 Epoch: 1109 Average loss: 7.9226 Epoch: 1119 Average loss: 7.9611 Epoch: 1129 Average loss: 7.9078 Epoch: 1139 Average loss: 7.8031 Epoch: 1149 Average loss: 7.9019 Epoch: 1159 Average loss: 8.0037 Epoch: 1169 Average loss: 7.7804 Epoch: 1179 Average loss: 7.5509 Epoch: 1189 Average loss: 7.6520 Epoch: 1199 Average loss: 7.5995 Epoch: 1209 Average loss: 7.4686 Epoch: 1219 Average loss: 7.4784 Epoch: 1229 Average loss: 7.5075 Epoch: 1239 Average loss: 7.3339 Epoch: 1249 Average loss: 7.6017 Epoch: 1259 Average loss: 7.5608 Epoch: 1269 Average loss: 7.4686 Epoch: 1279 Average loss: 7.4663 Epoch: 1289 Average loss: 7.4139 Epoch: 1299 Average loss: 7.3653 Epoch: 1309 Average loss: 7.2872 Epoch: 1319 Average loss: 7.3011 Epoch: 1329 Average loss: 7.0695 Epoch: 1339 Average loss: 7.1550 Epoch: 1349 Average loss: 7.1229 Epoch: 1359 Average loss: 7.0159 Epoch: 1369 Average loss: 6.9542 Epoch: 1379 Average loss: 7.1233 Epoch: 1389 Average loss: 7.1386 Epoch: 1399 Average loss: 6.9618 Epoch: 1409 Average loss: 7.0829 Epoch: 1419 Average loss: 7.0030 Epoch: 1429 Average loss: 7.1585 Epoch: 1439 Average loss: 6.8555 Epoch: 1449 Average loss: 7.0408 Epoch: 1459 Average loss: 7.0564 Epoch: 1469 Average loss: 6.8917 Epoch: 1479 Average loss: 6.8453 Epoch: 1489 Average loss: 6.8007 Epoch: 1499 Average loss: 6.7315 Epoch: 1509 Average loss: 6.8691 Epoch: 1519 Average loss: 6.6551 Epoch: 1529 Average loss: 6.6477 Epoch: 1539 Average loss: 6.6061 Epoch: 1549 Average loss: 6.6745 Epoch: 1559 Average loss: 6.6553 Epoch: 1569 Average loss: 6.6551 Epoch: 1579 Average loss: 6.4762 Epoch: 1589 Average loss: 6.5583 Epoch: 1599 Average loss: 6.4353 Epoch: 1609 Average loss: 6.5118 Epoch: 1619 Average loss: 6.4389 Epoch: 1629 Average loss: 6.4581 Epoch: 1639 Average loss: 6.3505 Epoch: 1649 Average loss: 6.2204 Epoch: 1659 Average loss: 6.4702 Epoch: 1669 Average loss: 6.3532 Epoch: 1679 Average loss: 6.2896 Epoch: 1689 Average loss: 6.2773 Epoch: 1699 Average loss: 6.2468 Epoch: 1709 Average loss: 6.0961 Epoch: 1719 Average loss: 6.1422 Epoch: 1729 Average loss: 6.1213 Epoch: 1739 Average loss: 6.0580 Epoch: 1749 Average loss: 6.1193 Epoch: 1759 Average loss: 6.1343 Epoch: 1769 Average loss: 5.9983 Epoch: 1779 Average loss: 6.0095 Epoch: 1789 Average loss: 6.1681 Epoch: 1799 Average loss: 6.0082 Epoch: 1809 Average loss: 6.1135 Epoch: 1819 Average loss: 5.9479 Epoch: 1829 Average loss: 6.0949 Epoch: 1839 Average loss: 5.8033 Epoch: 1849 Average loss: 6.0347 Epoch: 1859 Average loss: 5.9187 Epoch: 1869 Average loss: 5.9081 Epoch: 1879 Average loss: 5.8434 Epoch: 1889 Average loss: 5.8565 Epoch: 1899 Average loss: 5.8340 Epoch: 1909 Average loss: 5.9531 Epoch: 1919 Average loss: 5.7985 Epoch: 1929 Average loss: 5.8798 Epoch: 1939 Average loss: 5.7230 Epoch: 1949 Average loss: 5.6562 Epoch: 1959 Average loss: 5.7179 Epoch: 1969 Average loss: 5.7706 Epoch: 1979 Average loss: 5.6192 Epoch: 1989 Average loss: 5.7394 Epoch: 1999 Average loss: 5.5444
plt.plot(vrae.all_loss)  # training loss curve
plt.plot(vrae.rec_mse)  # reconstruction MSE curve
#If the latent vectors have to be saved, pass the parameter `save`
z_run = vrae.transform(train_dataset, save = True, filename = 'z_run_e57_c5_2000epoch.pkl')
z_run.shape
(17984, 32)
vrae.save('./vrae_e57_c5_2000epoch.pkl')
vrae.load(dload+'/vrae_3_10000epoch.pth')
with open(dload+'/z_run.pkl', 'rb') as fh:
    z_run = pickle.load(fh)
reconstruction = vrae.reconstruct(train_dataset)
reconstruction = reconstruction.transpose((1,0,2))
# Show one reconstruction
num_seq = reconstruction.shape[0]
fig, axs = plt.subplots(3,5, figsize = (20, 15))
idx = np.random.choice(num_seq, 1)[0]
# idx = 250
for ii in range(number_of_features):
    ori = X_train[idx, :, ii]
    rec = reconstruction[idx, :, ii]
    axs[ii//5, ii%5].plot(ori, color = 'black')
    axs[ii//5, ii%5].plot(rec, color = 'red')
    axs[ii//5, ii%5].set_title(ii+1)
fig.suptitle(f'Original and reconstruction of sequence # {idx+1}', size = 20)
plt.show()
corr = []
num_seq
for ii in range(number_of_features):
    temp = []
    for jj in range(num_seq):
        corr_temp = np.corrcoef(reconstruction[jj, :, ii], X_train[jj, :, ii])[0,1]
        temp.append(corr_temp)
    corr.append(temp)
# Plot multiple channels
# fig, axs = plt.subplots(number_of_features, 1, figsize = (20, number_of_features*4))
# for ii in range(number_of_features):
#     axs[ii].plot(corr[ii])
#     axs[ii].set_title(f'Channel {ii+1}, mean corr = {np.mean(corr[ii])}')
#     axs[ii].set_ylim([-0.8, 1.1])
#     axs[ii].set_xlim([0, 2000])
# Plot only one channel
fig, axs = plt.subplots(number_of_features, 1, figsize = (20, 5))
axs.plot(corr[0])
axs.set_title(f'Channel {single_channel}, mean corr = {np.mean(corr[0])}')
axs.set_ylim([-0.8, 1.1])
# axs.set_xlim([0, 2000])
from sklearn.metrics import mean_squared_error as mse
mse_all = []
mean_all = []
for ii in range(number_of_features):
    mse_channel = []
    mean_channel = []
    for jj in range(X_train.shape[0]):
        ori = X_train[jj, :, ii]
        rec = reconstruction[jj, :, ii]
        mean_seq = np.mean(ori)
        mse_seq = mse(ori, rec)
        mse_channel.append(mse_seq)
        mean_channel.append(mean_seq)
    mse_channel = np.array(mse_channel)
    mean_channel = np.array(mean_channel)
    #print(ii)
    mse_all.append(mse_channel)
    mean_all.append(mean_channel)
times = np.max(mse_all)
# # Plot multiple channels
# fig, axs = plt.subplots(number_of_features, 1, figsize = (20, number_of_features*6))
# for ii in range(number_of_features):
#     temp = np.array(corr[ii])*times/3
#     axs[ii].plot(temp, color = 'r', label = 'corr')
#     axs[ii].plot(mse_all[ii], color = 'b', label = 'mse')
#     axs[ii].plot(mean_all[ii], color = 'k', label = 'mean')
#     axs[ii].set_title(f'Channel {ii+1}')
#     axs[ii].set_xlim([8000, 12000])
#     axs[ii].legend()
# Plot only one channel
fig, axs = plt.subplots(number_of_features, 1, figsize = (20, 5))
nn = num_seq  # number of sequences to plot
axs.plot(corr[0][:nn], color = 'r', label = 'corr')
axs.plot(mse_all[0][:nn], color = 'b', label = 'mse')
axs.plot(mean_all[0][:nn], color = 'k', label = 'mean')
axs.set_title(f'Channel {single_channel}')
axs.legend()
# axs.set_xlim([0, 2000])
bhvs = {'crawling': np.array([0]),
        'high picking treats': np.array([1]),
        'low picking treats': np.array([2]),
        'pg': np.array([3]),
        'sitting still': np.array([4]),
        'grooming': np.array([5]),
        'no_behavior': np.array([-1])}
inv_bhvs = {int(v): k for k, v in bhvs.items()}
# Pick every fourth sequence for visualization
z_run_down = z_run[::4, :]
# z_run_pca = TruncatedSVD(n_components=2).fit_transform(z_run_down)
z_run_pca = PCA(n_components=2).fit_transform(z_run_down)
z_run_tsne = TSNE(perplexity=80, min_grad_norm=1E-12, n_iter=3000).fit_transform(z_run_down)
all_colors = ['b','g','r','c','m','y','darkgrey']
label = y_train[::4, :]
fig, axs = plt.subplots(1,2, figsize=(20,10))
for ii in np.unique(label):
    ii = int(ii)
    x_pca = z_run_pca[:,0].reshape(-1,1)[label == ii]
    y_pca = z_run_pca[:,1].reshape(-1,1)[label == ii]
    axs[0].scatter(x_pca, y_pca, c=all_colors[ii], marker='.', label = inv_bhvs[ii], linewidths=None)
    x_tsne = z_run_tsne[:,0].reshape(-1,1)[label == ii]
    y_tsne = z_run_tsne[:,1].reshape(-1,1)[label == ii]
    axs[1].scatter(x_tsne, y_tsne, c=all_colors[ii], marker='.', label = inv_bhvs[ii], linewidths=None)
axs[0].set_title('PCA on z_run')
axs[1].set_title('tSNE on z_run')
axs[0].legend()
axs[1].legend()
plt.show()
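The commented block below assigns each latent vector to a cluster with a Gaussian mixture model and writes the assignments to a Bento .annot file. A brief, runnable sketch of just the clustering step it relies on (same settings as in the commented code, assuming z_run holds the latent vectors):
from sklearn.mixture import GaussianMixture
# Fit a 5-component Gaussian mixture on the latent vectors and assign each sequence
# to its most likely component (mirrors the commented code below)
gm = GaussianMixture(n_components=5, random_state=0).fit(z_run)
clusters = gm.predict(z_run)
print(np.unique(clusters, return_counts=True))  # cluster labels and sizes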
# # Create clusters.annot
# from sklearn.mixture import GaussianMixture
# # Predict cluster assignments
# gm = GaussianMixture(n_components=5, random_state=0).fit(z_run)
# clusters = gm.predict(z_run)
# # Number of seconds in each sequence
# filt_time_step = 0.025
# num_secs_seq = seq_len * filt_time_step
# end_time = len(z_run) * num_secs_seq + num_secs_seq
# # Open the output file
# f = open('Pop01-06_18_2021.annot', 'w')
# # write the header--------------------
# f.write('Bento annotation file\n')
# f.write('Movie file(s): {}\n\n'.format('Pop_20210618_cage_C1_01.avi'))
# f.write('{0} {1}\n'.format('Stimulus name:', ''))
# f.write('{0} {1}\n'.format('Annotation start frame:', 1))
# f.write('{0} {1}\n'.format('Annotation stop frame:', 26994))
# f.write('{0} {1}\n'.format('Annotation framerate:', 30))
# f.write('\n{0}\n'.format('List of channels:'))
# channels = ['cluster_num']
# for item in channels:
#     f.write('{0}\n'.format(item))
# f.write('\n')
# f.write('{0}\n'.format('List of annotations:'))
# clust_names = ['cluster_{}'.format(str(num)) for num in set(clusters)]
# labels = clust_names
# # labels = [item.replace(' ','_') for item in labels]
# for item in labels:
#     f.write('{0}\n'.format(item))
# f.write('\n')
# # now write the contents---------------
# for ch in channels:
#     f.write('{0}----------\n'.format(ch))
#     for beh in labels:
#         f.write('>{0}\n'.format(beh))
#         f.write('{0}\t {1}\t {2} \n'.format('Start', 'Stop', 'Duration'))
#         idxs = np.where(clusters == int(beh.split('_')[-1]))[0]
#         for hit in idxs:
#             start_time = hit * num_secs_seq/2
#             end_time = start_time + num_secs_seq
#             f.write('{0}\t{1}\t{2}\n'.format(start_time, end_time, num_secs_seq))
#         f.write('\n')
#     f.write('\n')
# f.close()